{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from pandas.tseries.offsets import *\n",
    "from tqdm import tqdm\n",
    "from functools import reduce\n",
    "from sklearnex import patch_sklearn\n",
    "patch_sklearn()\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.linear_model import LinearRegression\n",
    "import statsmodels.api as sm\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\", category=DeprecationWarning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ratio_chars = ['CAPEI', 'bm',\n",
    "       'evm', 'pe_exi', 'pe_inc', 'ps', 'pcf',\n",
    "       'dpr', 'npm', 'opmbd', 'opmad', 'gpm', 'ptpm', 'cfm', 'roa', 'roe',\n",
    "       'roce', 'efftax', 'aftret_eq', 'aftret_invcapx', 'aftret_equity',\n",
    "       'pretret_noa', 'pretret_earnat', 'GProf', 'equity_invcap',\n",
    "       'debt_invcap', 'totdebt_invcap', 'capital_ratio', 'int_debt',\n",
    "       'int_totdebt', 'cash_lt', 'invt_act', 'rect_act', 'debt_at',\n",
    "       'debt_ebitda', 'short_debt', 'curr_debt', 'lt_debt', 'profit_lct',\n",
    "       'ocf_lct', 'cash_debt', 'fcf_ocf', 'lt_ppent', 'dltt_be', 'debt_assets',\n",
    "       'debt_capital', 'de_ratio', 'intcov', 'intcov_ratio', 'cash_ratio',\n",
    "       'quick_ratio', 'curr_ratio', 'cash_conversion', 'inv_turn', 'at_turn',\n",
    "       'rect_turn', 'pay_turn', 'sale_invcap', 'sale_equity', 'sale_nwc',\n",
    "       'rd_sale', 'adv_sale', 'staff_sale', 'accrual', 'ptb', 'PEG_trailing',\n",
    "       'divyield']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RF with look-ahead bias"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tmp = pd.read_parquet('../data/Results/df_train_new.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Rolling Window:\n",
    "time_idx = sorted(df_tmp['YearMonth'].unique())\n",
    "time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]\n",
    "num_trees = 2000\n",
    "n_jobs = 12\n",
    "forecast = []\n",
    "for t in tqdm(time_idx):\n",
    "\n",
    "    ### Q1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(12)) & (df_tmp['ANNDATS_q1'] + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "    # break\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=0.05,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q1':y_pred, 'AF_q1':df_test['EPS_ana_q1'], 'AE_q1':df_test['EPS_true_q1']})\n",
    "\n",
    "    ### Q2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q2','EPS_ana_q2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q2'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(12)) & (df_tmp['ANNDATS_q2']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=0.05,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q2':y_pred, 'AF_q2':df_test['EPS_ana_q2'], 'AE_q2':df_test['EPS_true_q2']})\n",
    "\n",
    "    ### Q3 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q3','EPS_ana_q3'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q3'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(12)) & (df_tmp['ANNDATS_q3']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q3']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=0.05,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q3 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q3':y_pred, 'AF_q3':df_test['EPS_ana_q3'], 'AE_q3':df_test['EPS_true_q3']})\n",
    "\n",
    "    ### Y1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(12)) & (df_tmp['ANNDATS_y1']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=0.05,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y1':y_pred, 'AF_y1':df_test['EPS_ana_y1'], 'AE_y1':df_test['EPS_true_y1']})\n",
    "\n",
    "    ### Y2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y2','EPS_ana_y2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y2'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(24)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=0.05,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y2':y_pred, 'AF_y2':df_test['EPS_ana_y2'], 'AE_y2':df_test['EPS_true_y2']})\n",
    "\n",
    "    forecast.append(reduce(lambda x,y: pd.merge(x,y,\n",
    "                                       on=['permno','YearMonth'],\n",
    "                                       how='outer'),\n",
    "                 [forecast_q1,forecast_q2,forecast_q3,\n",
    "                  forecast_y1,forecast_y2]))\n",
    "    # break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "forecast_all = pd.concat(forecast,axis=0).reset_index()\n",
    "forecast_all.to_parquet('../data/Results/RF_with_lookahead_raw_005.parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RF without look-ahead bias"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "##############################\n",
    "## Difference in predictors ##\n",
    "##############################\n",
    "# Q1: same\n",
    "# Q2: 'EPS_true_l1_q2' --> 'EPS_true_l1_q1'\n",
    "# Q3: 'EPS_true_l1_q3' --> 'EPS_true_l1_q1'\n",
    "# Y1: same\n",
    "# Y2: 'EPS_true_l1_y2' --> 'EPS_true_l1_y1'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Rolling Window:\n",
    "time_idx = sorted(df_tmp['YearMonth'].unique())\n",
    "time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]\n",
    "\n",
    "forecast = []\n",
    "for t in tqdm(time_idx):\n",
    "\n",
    "    ### Q1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(12)) & (df_tmp['ANNDATS_q1'] + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "    # break\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=0.05,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q1':y_pred, 'AF_q1':df_test['EPS_ana_q1'], 'AE_q1':df_test['EPS_true_q1']})\n",
    "\n",
    "    ### Q2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q2'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(12)) & (df_tmp['ANNDATS_q2']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=0.05,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q2':y_pred, 'AF_q2':df_test['EPS_ana_q2'], 'AE_q2':df_test['EPS_true_q2']})\n",
    "\n",
    "    ### Q3 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q3'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q3'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(12)) & (df_tmp['ANNDATS_q3']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q3']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=0.05,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q3 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q3':y_pred, 'AF_q3':df_test['EPS_ana_q3'], 'AE_q3':df_test['EPS_true_q3']})\n",
    "\n",
    "    ### Y1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(12)) & (df_tmp['ANNDATS_y1']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=0.05,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y1':y_pred, 'AF_y1':df_test['EPS_ana_y1'], 'AE_y1':df_test['EPS_true_y1']})\n",
    "\n",
    "    ### Y2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y2'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(24)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=0.05,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y2':y_pred, 'AF_y2':df_test['EPS_ana_y2'], 'AE_y2':df_test['EPS_true_y2']})\n",
    "\n",
    "    forecast.append(reduce(lambda x,y: pd.merge(x,y,\n",
    "                                       on=['permno','YearMonth'],\n",
    "                                       how='outer'),\n",
    "                 [forecast_q1,forecast_q2,forecast_q3,\n",
    "                  forecast_y1,forecast_y2]))\n",
    "    # break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "forecast_all = pd.concat(forecast,axis=0).reset_index()\n",
    "forecast_all.to_parquet('../data/Results/RF_wo_lookahead_raw_005.parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Hughes et al.(2008)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "###  Characteristics ###\n",
    "\n",
    "# acc, dPPE, dOLA, sg_5y from Compustat\n",
    "compa = pd.read_parquet('../data/WRDS/compa.parquet')\n",
    "compa['gvkey'] = compa['gvkey'].astype(float)\n",
    "compa['datadate'] = compa['datadate'] + MonthEnd(0)\n",
    "compa = compa[compa['at_avg'] > 0].copy()\n",
    "compa['acc'] = compa['acc']/compa['at_avg']\n",
    "compa['dPPE'] = compa['ppegt_diff']/compa['at_avg']\n",
    "compa['dOLA'] = compa['ao_diff']/compa['at_avg']\n",
    "compa['sg_5y'] = np.nan_to_num(compa['sg_5y'], nan=np.nan, posinf=np.nan, neginf=np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# LTG from IBES\n",
    "# consensus = pd.read_parquet('../data/WRDS/EPS_summary.parquet')\n",
    "# consensus['YearMonth'] = consensus.statpers + MonthEnd(0)\n",
    "# consensus_ltg = consensus[consensus.fpi == '0'].copy()\n",
    "# consensus_ltg['LTG'] = consensus_ltg['meanest']\n",
    "# consensus_ltg = consensus_ltg[['ticker','YearMonth','LTG']]\n",
    "\n",
    "consensus = pd.read_parquet('../data/WRDS/Forecast_EPS_summary_unadjusted_2023.parquet')\n",
    "consensus.columns = consensus.columns.str.lower()\n",
    "consensus['statpers'] = pd.to_datetime(consensus['statpers'], format='%Y-%m-%d')\n",
    "consensus['YearMonth'] = consensus['statpers'] + MonthEnd(0)\n",
    "consensus_ltg = consensus[consensus['fpi'] == '0'].copy()\n",
    "consensus_ltg['LTG'] = consensus_ltg['meanest']\n",
    "consensus_ltg = consensus_ltg[['ticker','YearMonth','LTG']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Forecast Revision from IBES\n",
    "\n",
    "# Analyst Forecast Earnings:\n",
    "crsp = pd.read_parquet('../data/WRDS/crsp_m.parquet')\n",
    "crsp['ME'] = abs(crsp['prc']) * crsp['shrout']\n",
    "consensus = consensus.merge(crsp[['permno','ncusip','YearMonth','shrout']],\n",
    "                            left_on=['cusip','YearMonth'],\n",
    "                            right_on=['ncusip','YearMonth'])\n",
    "consensus['AF'] = consensus['meanest'] * consensus['shrout']\n",
    "\n",
    "## Revision for each Annual horizon\n",
    "result = []\n",
    "for i in [1,2]:\n",
    "    consensus_f1 = consensus[consensus.fpi=='{}'.format(i)].copy()\n",
    "    consensus_f1.drop_duplicates(subset=['YearMonth','ticker'], inplace=True)\n",
    "\n",
    "    consensus_f1_lag = consensus[consensus.fpi.isin(['{}'.format(i),\n",
    "                                                     '{}'.format(i+1)])][['YearMonth','ticker',\n",
    "                                                                          'fpedats','AF']].copy()\n",
    "\n",
    "    # Forecast in last month: F_{t-1}[x_t]\n",
    "    consensus_f1_lag['YearMonth'] = consensus_f1_lag['YearMonth'] + MonthEnd(1)\n",
    "    consensus_f1_lag.rename(columns={'AF':'AF_l1'},inplace=True)\n",
    "\n",
    "    consensus_f1_change = consensus_f1[['YearMonth','permno','ticker','fpedats','AF']].merge(consensus_f1_lag,\n",
    "                                                                                        on=['YearMonth','ticker','fpedats'],\n",
    "                                                                                        how='left')\n",
    "\n",
    "    consensus_f1_change['FRevision_A{}'.format(i)] = consensus_f1_change['AF'] - \\\n",
    "                                                     consensus_f1_change['AF_l1']\n",
    "    consensus_f1_change.drop_duplicates(subset=['YearMonth','permno'], inplace=True)\n",
    "    result.append(consensus_f1_change[['permno','YearMonth','FRevision_A{}'.format(i)]])\n",
    "\n",
    "FR = reduce(lambda x,y: pd.merge(x,y,on=['permno','YearMonth'], how='outer'), result)\n",
    "FR.sort_values(by=['permno','YearMonth'],inplace=True,ignore_index=True)\n",
    "# Revision in recent 3months\n",
    "FR[['FRevision_A1_3m','FRevision_A2_3m']] = FR.groupby('permno')[['FRevision_A1','FRevision_A2']]\\\n",
    "                                        .rolling(3).sum().reset_index(level=0, drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Earnings Surprise\n",
    "\n",
    "## Most recent Quarterly Earnings Announcement SUE\n",
    "# 1. Actual\n",
    "# IBES actual\n",
    "EPS_true = pd.read_parquet('../data/WRDS/Actual_EPS_detail_unadjusted_2023.parquet')\n",
    "EPS_true['ANNDATS'] = pd.to_datetime(EPS_true['ANNDATS'], format='%Y-%m-%d')\n",
    "EPS_true['YearMonth'] = EPS_true['ANNDATS'] + MonthEnd(0)\n",
    "EPS_true = EPS_true.merge(crsp[['ncusip','YearMonth','shrout','cfacshr','ME']],\n",
    "                          left_on=['CUSIP','YearMonth'],\n",
    "                          right_on=['ncusip','YearMonth'])\n",
    "EPS_true['AE'] = EPS_true['VALUE'] * EPS_true['shrout']\n",
    "EPS_true['YearMonth'] = EPS_true['ANNDATS'] + MonthEnd(-1) # use this to merge with forecast\n",
    "EPS_true_qtr = EPS_true[EPS_true['PDICITY'] == 'QTR'].sort_values(by=['TICKER','PENDS'])\n",
    "# 2. Forecast in last month\n",
    "consensus_1q = consensus[consensus['fpi'] == '6']\n",
    "FE_last = EPS_true_qtr[['TICKER','ANNDATS','YearMonth','PENDS','AE','ME']].merge(consensus_1q[['ticker','YearMonth','fpedats','AF','statpers']],\n",
    "                   left_on=['TICKER','YearMonth','PENDS'],\n",
    "                   right_on=['ticker','YearMonth','fpedats'],\n",
    "                  )\n",
    "FE_last['SUE'] = (FE_last['AE'] - FE_last['AF']) / FE_last['ME']\n",
    "# These SUE shoulbe be used after the announcement\n",
    "FE_last['YearMonth'] = FE_last['ANNDATS'] + MonthEnd(0)\n",
    "FE_last.drop_duplicates(subset=['TICKER','YearMonth'], keep='last', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Momentum\n",
    "crsp = pd.read_parquet('../data/WRDS/crsp_m.parquet')\n",
    "crsp.sort_values(by=['permno','YearMonth'], inplace=True)\n",
    "crsp['ret_12m'] = np.log(1 + crsp['ret']).groupby(crsp['permno'])\\\n",
    "                    .rolling(12).sum()\\\n",
    "                    .reset_index(level=0, drop=True)\n",
    "crsp['ret_12m'] = np.exp(crsp['ret_12m']) - 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tmp = pd.read_parquet('../data/Results/df_train_new.parquet')\n",
    "df_tmp = df_tmp.merge(compa, on=['gvkey','datadate'], how='left')\n",
    "df_tmp = df_tmp.merge(FR,\n",
    "                      on=['permno','YearMonth'], how='left')\n",
    "df_tmp = df_tmp.merge(consensus_ltg,\n",
    "                      left_on=['TICKER','YearMonth'],\n",
    "                      right_on=['ticker','YearMonth'],\n",
    "                      how='left')\n",
    "df_tmp = df_tmp.merge(FE_last[['TICKER','YearMonth','SUE']],\n",
    "                      on=['TICKER','YearMonth'],\n",
    "                      how='left'\n",
    "                     )\n",
    "df_tmp = df_tmp.merge(crsp[['permno','YearMonth','ret_12m']],\n",
    "                      on=['permno','YearMonth'],\n",
    "                      how='left'\n",
    "                     )\n",
    "df_tmp.sort_values(by=['permno','YearMonth'], inplace=True, ignore_index=True)\n",
    "df_tmp['SUE'] = df_tmp.groupby('permno')['SUE'].ffill(limit=6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tmp.duplicated(subset=['permno','YearMonth']).sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preprocess and Make Forecasts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tmp['FRevision_A1_3m_std'] = df_tmp['FRevision_A1_3m']/df_tmp['ME']\n",
    "\n",
    "# winsorization period-by-period\n",
    "cols = [\n",
    "        'acc','LTG', 'sg_5y','dPPE', 'dOLA','ret_12m',\n",
    "        'SUE','FRevision_A1_3m_std'\n",
    "       ]\n",
    "\n",
    "df_tmp[cols] = df_tmp.groupby('YearMonth',group_keys=False)[cols]\\\n",
    "                     .transform(lambda x: x.clip(x.quantile(0.01),x.quantile(0.99)))\n",
    "\n",
    "# ## FillNA with Industry Median\n",
    "fillNA = ['acc','LTG', 'sg_5y','dPPE', 'dOLA','ret_12m',\n",
    "        'SUE','FRevision_A1_3m_std'\n",
    "       ]\n",
    "for v in tqdm(fillNA):\n",
    "    df_tmp[v] = df_tmp.groupby(['YearMonth','fama49'], group_keys=False)[v].apply(lambda x: x.fillna(x.median()))\n",
    "## In case some characteristics are all NA in some industry\n",
    "for v in tqdm(fillNA):\n",
    "    df_tmp[v] = df_tmp.groupby(['YearMonth'], group_keys=False)[v].apply(lambda x: x.fillna(x.median()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tmp.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Rolling Window:\n",
    "time_idx = sorted(df_tmp['YearMonth'].unique())\n",
    "time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]\n",
    "forecast = []\n",
    "\n",
    "for t in tqdm(time_idx):\n",
    "\n",
    "    ### Q1 ###\n",
    "    x_cols = ['EPS_ana_q1','acc','LTG', 'sg_5y','dPPE', 'dOLA','SUE','ret_12m','FRevision_A1_3m_std']\n",
    "    y_col = 'EPS_true_q1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(60)) & (df_tmp['ANNDATS_q1'] + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "    # break\n",
    "    mdl = sm.OLS(df_train[y_col], sm.add_constant(df_train[x_cols])).fit()\n",
    "    y_pred = mdl.predict(sm.add_constant(df_test[x_cols]))\n",
    "\n",
    "    forecast_q1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'LF_q1':y_pred, 'AF_q1':df_test['EPS_ana_q1'], 'AE_q1':df_test['EPS_true_q1']})\n",
    "\n",
    "    ### Q2 ###\n",
    "    x_cols = ['EPS_ana_q2','acc','LTG', 'sg_5y','dPPE', 'dOLA','SUE','ret_12m','FRevision_A1_3m_std']\n",
    "    y_col = 'EPS_true_q2'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(60)) & (df_tmp['ANNDATS_q2'] + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "    mdl = sm.OLS(df_train[y_col], sm.add_constant(df_train[x_cols])).fit()\n",
    "    y_pred = mdl.predict(sm.add_constant(df_test[x_cols]))\n",
    "\n",
    "    forecast_q2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'LF_q2':y_pred, 'AF_q2':df_test['EPS_ana_q2'], 'AE_q2':df_test['EPS_true_q2']})\n",
    "\n",
    "    ### Q3 ###\n",
    "    x_cols = ['EPS_ana_q3','acc','LTG', 'sg_5y','dPPE', 'dOLA','SUE','ret_12m','FRevision_A1_3m_std']\n",
    "    y_col = 'EPS_true_q3'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(60)) & (df_tmp['ANNDATS_q3'] + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q3']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "    mdl = sm.OLS(df_train[y_col], sm.add_constant(df_train[x_cols])).fit()\n",
    "    y_pred = mdl.predict(sm.add_constant(df_test[x_cols]))\n",
    "\n",
    "    forecast_q3 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'LF_q3':y_pred, 'AF_q3':df_test['EPS_ana_q3'], 'AE_q3':df_test['EPS_true_q3']})\n",
    "\n",
    "    ### Y1 ###\n",
    "    x_cols = ['EPS_ana_y1','acc','LTG', 'sg_5y','dPPE', 'dOLA','SUE','ret_12m','FRevision_A1_3m_std']\n",
    "    y_col = 'EPS_true_y1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(60)) & (df_tmp['ANNDATS_y1'] + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "    mdl = sm.OLS(df_train[y_col], sm.add_constant(df_train[x_cols])).fit()\n",
    "    y_pred = mdl.predict(sm.add_constant(df_test[x_cols]))\n",
    "\n",
    "    forecast_y1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'LF_y1':y_pred, 'AF_y1':df_test['EPS_ana_y1'], 'AE_y1':df_test['EPS_true_y1']})\n",
    "\n",
    "    ### Y2 ###\n",
    "    x_cols = ['EPS_ana_y2','acc','LTG', 'sg_5y','dPPE', 'dOLA','SUE','ret_12m','FRevision_A1_3m_std']\n",
    "    y_col = 'EPS_true_y2'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(60)) & (df_tmp['ANNDATS_y2'] + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "    mdl = sm.OLS(df_train[y_col], sm.add_constant(df_train[x_cols])).fit()\n",
    "    y_pred = mdl.predict(sm.add_constant(df_test[x_cols]))\n",
    "\n",
    "    forecast_y2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'LF_y2':y_pred, 'AF_y2':df_test['EPS_ana_y2'], 'AE_y2':df_test['EPS_true_y2']})\n",
    "\n",
    "    forecast.append(reduce(lambda x,y: pd.merge(x,y,\n",
    "                                       on=['permno','YearMonth'],\n",
    "                                       how='outer'),\n",
    "                 [forecast_q1,forecast_q2,forecast_q3,\n",
    "                  forecast_y1,forecast_y2]))\n",
    "    # break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "forecast_all = pd.concat(forecast,axis=0).reset_index()\n",
    "forecast_all.to_parquet('../data/Results/Hughes_eps.parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# So (2013)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tmp = pd.read_parquet('../data/Results/df_train_new.parquet')\n",
    "# compustat data\n",
    "compa = pd.read_parquet('../data/WRDS/compa.parquet')\n",
    "compa['gvkey'] = compa['gvkey'].astype(float)\n",
    "compa['datadate'] = compa['datadate'] + MonthEnd(0)\n",
    "compa = compa[compa['at_avg'] > 0].copy()\n",
    "compa['acc'] = compa['acc']/compa['csho']\n",
    "compa['div'] = compa['dvc']/compa['csho']\n",
    "compa['dd'] = np.where(compa['dvc']==0, 1, 0)\n",
    "compa['acc_n'] = compa['acc'].clip(upper=0).abs()\n",
    "compa['acc_p'] = compa['acc'].clip(lower=0)\n",
    "\n",
    "df_tmp = df_tmp.merge(compa[['gvkey','datadate','dd','div',\n",
    "                             'acc_n','acc_p','ag',\n",
    "                            ]],\n",
    "                        left_on=['gvkey','adate'],\n",
    "                        right_on=['gvkey','datadate'],\n",
    "                        how='left')\n",
    "\n",
    "df_tmp['earnings_pos_l1_y1'] = df_tmp['EPS_true_l1_y1'].clip(lower=0)\n",
    "df_tmp['nege_l1_y1'] = np.where(df_tmp['EPS_true_l1_y1'] < 0, 1, 0)\n",
    "\n",
    "df_tmp['earnings_pos_l1_q1'] = df_tmp['EPS_true_l1_q1'].clip(lower=0)\n",
    "df_tmp['nege_l1_q1'] = np.where(df_tmp['EPS_true_l1_q1'] < 0, 1, 0)\n",
    "\n",
    "## winsorize period-by-period\n",
    "cols = ['earnings_pos_l1_y1','earnings_pos_l1_q1',\n",
    "        'acc_n','acc_p','ag','div',\n",
    "       ]\n",
    "df_tmp[cols] = df_tmp.groupby('YearMonth')[cols]\\\n",
    "                    .transform(lambda x: x.clip(x.quantile(0.01),x.quantile(0.99)))\n",
    "\n",
    "# ## FillNA with Industry Median\n",
    "fillNA = ['earnings_pos_l1_q1','nege_l1_q1',\n",
    "          'earnings_pos_l1_y1','nege_l1_y1',\n",
    "          'acc_n','acc_p','ag','dd','bm','prc','div']\n",
    "for v in tqdm(fillNA):\n",
    "    df_tmp[v] = df_tmp.groupby(['YearMonth','fama49'], group_keys=False)[v].apply(lambda x: x.fillna(x.median()))\n",
    "## In case some characteristics are all NA in some industry\n",
    "for v in tqdm(fillNA):\n",
    "    df_tmp[v] = df_tmp.groupby(['YearMonth'], group_keys=False)[v].apply(lambda x: x.fillna(x.median()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tmp.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Forecasts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Rolling Window:\n",
    "time_idx = sorted(df_tmp['YearMonth'].unique())\n",
    "time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]\n",
    "\n",
    "params = {}\n",
    "t_values = {}\n",
    "\n",
    "forecast = []\n",
    "for t in tqdm(time_idx):\n",
    "\n",
    "    ### Q1 ###\n",
    "    x_cols = ['EPS_ana_q1','earnings_pos_l1_q1','nege_l1_q1','acc_n','acc_p','ag','dd','bm','prc','div']\n",
    "    y_col = 'EPS_true_q1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(12)) & (df_tmp['ANNDATS_q1'] + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "    # break\n",
    "    mdl = LinearRegression()\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'So_q1':y_pred, 'AF_q1':df_test['EPS_ana_q1'], 'AE_q1':df_test['EPS_true_q1']})\n",
    "\n",
    "    ### Q2 ###\n",
    "    x_cols = ['EPS_ana_q2','earnings_pos_l1_q1','nege_l1_q1','acc_n','acc_p','ag','dd','bm','prc','div']\n",
    "    y_col = 'EPS_true_q2'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(12)) & (df_tmp['ANNDATS_q2']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = LinearRegression()\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'So_q2':y_pred, 'AF_q2':df_test['EPS_ana_q2'], 'AE_q2':df_test['EPS_true_q2']})\n",
    "\n",
    "    ### Q3 ###\n",
    "    x_cols = ['EPS_ana_q3','earnings_pos_l1_q1','nege_l1_q1','acc_n','acc_p','ag','dd','bm','prc','div']\n",
    "    y_col = 'EPS_true_q3'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(12)) & (df_tmp['ANNDATS_q3']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q3']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = LinearRegression()\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q3 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'So_q3':y_pred, 'AF_q3':df_test['EPS_ana_q3'], 'AE_q3':df_test['EPS_true_q3']})\n",
    "\n",
    "    ### Y1 ###\n",
    "    x_cols = ['EPS_ana_y1','earnings_pos_l1_y1','nege_l1_y1','acc_n','acc_p','ag','dd','bm','prc','div']\n",
    "    y_col = 'EPS_true_y1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(12)) & (df_tmp['ANNDATS_y1']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = LinearRegression()\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    mdl_sm = sm.OLS(df_train[y_col], sm.add_constant(df_train[x_cols])).fit()\n",
    "    params[t] = mdl_sm.params\n",
    "    t_values[t] = mdl_sm.tvalues\n",
    "\n",
    "    forecast_y1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'So_y1':y_pred, 'AF_y1':df_test['EPS_ana_y1'], 'AE_y1':df_test['EPS_true_y1']})\n",
    "\n",
    "    ### Y2 ###\n",
    "    x_cols = ['EPS_ana_y2','earnings_pos_l1_y1','nege_l1_y1','acc_n','acc_p','ag','dd','bm','prc','div']\n",
    "    y_col = 'EPS_true_y2'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(24)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = LinearRegression()\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'So_y2':y_pred, 'AF_y2':df_test['EPS_ana_y2'], 'AE_y2':df_test['EPS_true_y2']})\n",
    "\n",
    "    forecast.append(reduce(lambda x,y: pd.merge(x,y,\n",
    "                                       on=['permno','YearMonth'],\n",
    "                                       how='outer'),\n",
    "                 [forecast_q1,forecast_q2,forecast_q3,\n",
    "                  forecast_y1,forecast_y2]))\n",
    "    # break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "forecast_all = pd.concat(forecast,axis=0).reset_index(drop=True)\n",
    "forecast_all.to_parquet('../data/Results/So_eps_AF.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "man_versus_machine",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
